
rm(list=ls())

# get user's m1
m1 <- Sys.getenv("LOGNAME")

# indicator for sub-programs for whether the entire program has been run
TEMPDISAGG <- 1

# Temporarily suppress warning messages. 
oldw <- getOption("warn")
options(warn = -1)

# Set Parameters ----------------------------------------------------------

group <- "networth" #### Options: networth, race, education, income, urban, generation, age, homeown, htm (hand to mouth)
tempdisagg_method <- "fernandez" # Here is where the different methods can be selected, baseline is fernandez; Options: chow-lin-maxlog, chow-lin-minrss-ecotrim, chow-lin-minrss-quilis, kalman
# fernandez, litterman-maxlog, litterman-minrss, kalman (uses separate kalman.R script for disaggregation, not tempdisagg package)
trunc.rho <- -1 # Only relevant for chowlin methods. If set to 0 (default), no negative values are allowed. If set to -1, truncation is disabled.
agg_level <- "bus_cycle2" ## Only relevant for networth and income. Options: default, high, low, bus_cycle, bus_cycle2, all, quint, expanded, expanded_2
scf_ignore <- NULL ## For out of sample testing. 
filetag <- NULL # optional way to add additional string to file name 
lastqtr <- NULL # option to only produce data through a certain date 

b101h_db <- "fof"  # where you want the b101h numbers to come from (fof or fofnet)

# Set directories ---------------------------------------------------------

home_directory = "[your home directory]" # change this to your home directory
scf_directory <- file.path(home_directory, "/inputs") # where the files from MEcS are located
output_directory <- file.path(home_directory, "/outputs") # where the program will output the resulting dfa levels and shares

# Load functions ----------------------------------------------------------

source(file.path(master_scripts_directory, "functions.R"))

if (b101h_db == "fofnet") {
  fofnet_path <- get_fofnet(message = FALSE)
  b101h_db <- fofnet_path
}


library(tempdisagg)

# use a random series from the chosen fof database to get the most recent quarter of data available 
date_series <- fame2df("PC073164013.Q", b101h_db)
if (is.null(lastqtr)) {
  lastqtr <- max(date_series$date)
} else {
  lastqtr <- as.Date(as.yearqtr(lastqtr), frac=1)
}

settings <- dfa_splits(group = group, agg_level = agg_level)
groupnames <- settings[["groupnames"]]
scf.recode <- settings[["scf.recode"]]
input.file <- settings[["input.file"]]
tag <- settings[["tag"]]

## Series mnemonics used in FA B.101.h

series <- c(real_estate = "LM155035015.Q", 
            con_durables = "LM155111005.Q", 
            check_curr = "FL193020005.Q", 
            time_dep = "FL193030205.Q", 
            mmfs = "FL193034005.Q",  
            us_gov_muni = "FL193061005.Q",
            corp_for_bonds = "FL193063005.Q", 
            oth_loans = "FL153067005.Q",
            morts_owed = "FL193065005.Q", 
            corp_equ_mfs = "LM193064005.Q", 
            life_ins = "FL153040005.Q", 
            pension = "FL153050005.Q", 
            non_corp_eq = "LM152090205.Q", 
            misc_assets = "FL153090005.Q", 
            morts = "FL153165105.Q", 
            cons_cred = "FL153166000.Q", 
            dep_loans = "FL193168005.Q", 
            oth_loans_adv = "FL193169005.Q", 
            unpaid_life_ins = "FL543077073.Q")

assets <- names(series)


# Read in and clean SCF data ----------------------------------------------

## Identify file path to Excel files
print("Reading in SCF data")

filepath <- file.path(scf_directory, paste0(input.file, ".xlsx"))

## Make SCF data frame using the function created
scf_data <- scf_balance_sheet(excel = filepath, hh = TRUE, income = FALSE)

sheet_names <- excel_sheets(filepath)
sheet_names <- sapply(strsplit(sheet_names," "), `[`, 1)


scf_data <- scf_data %>%
  dplyr::filter(date <= as.yearqtr(lastqtr)) %>% # filter out SCF observations that are after the last quarter you want to forecast 
  mutate(cat = recode(cat, !!!scf.recode)) %>% # relabel groups from MEcS files
  mutate(cat = factor(cat, levels = groupnames)) %>% # make the groups a factor instead of character for better organization
  group_by(date,cat) %>%
  dplyr::summarise_all(sum) %>% # add up all the groups within each group we are looking to distribute (e.g. Bottom50 = 1 through 5 in the wealth file)
  arrange(date,cat) %>%
  ungroup() 

write_csv(scf_data, dfa_file_names(item = "scf_levels_test", group = group, agg_level = agg_level, method = tempdisagg_method, filetag = filetag, scf_ignore = scf_ignore))

# put at per household level
scf_data <- scf_data %>%
  mutate_at(vars(-date, -cat, -hhcount), ~./hhcount) %>% 
  select(-hhcount)

## If you want to interpolate over / forecast an SCF, do that here by simply removing it (pretending you don't have the observation).

if (!is.null(scf_ignore)) {
  scf_ignore <- as.yearqtr(scf_ignore)
  scf_data <- dplyr::filter(scf_data, date != scf_ignore)
}

# Make sequence of dates you want in the resulting distribution
interpqtrs <- as.yearqtr(seq.Date(as.Date("1989-09-30"),lastqtr, by = "quarter"))

# Read in, interpolate, and extrapolate household counts ------------------
# in a separate script because the extrapolation is complicated to code
source(file.path(master_scripts_directory, "tempdisagg_hhcount.R"))


# Make list of data frames: one data frame for each group because we interpolate them separately 
scf_data_list <- split(scf_data, f = scf_data$cat)

# Compile indicator series ------------------------------------------------------
print("Compiling indicator series")
# Indicator series for DFA interpolation and extrapolation

indicator_choices <- list(
  real_estate = c("fhfa", "homeown"),
  con_durables =  c("snp500", "homeown"),
  check_curr = c("snp500", "realff"),
  time_dep = c("realff","nysevol"),
  mmfs = c("realff"),
  us_gov_muni = c("snp500", "realff"),
  corp_for_bonds = c("snp500", "realff"),
  oth_loans = c("snp500", "realff"),
  morts_owed = c(),
  corp_equ_mfs = c("realff"),
  life_ins = c("snp500", "realff"),
  pension = c("snp500", "realff", "dbdc"),
  non_corp_eq = c("realff","noncorp_re_res", "noncorp_re_nonres"),
  misc_assets = c("snp500", "realff", "nysevol"),
  morts = c("dpi3"),
  cons_cred = c("dpi3", "stuloans"),
  dep_loans = c(),
  oth_loans_adv = c("dpi3"),
  unpaid_life_ins = c("snp500", "dpi3"),
  inc = c("realff"),
  nipainc = c("realff"),
  norminc = c("realff")
)

## US Database

us_series <- list(snp500="PJSPCC_N.B", 
                  fhfa = "PJHERH_N.Q",
                  homeown = "HUPO_N.Q", 
                  cpi = "PJCUE_N.M", 
                  fedfunds = "RIFSPFF_N.M", 
                  pop = "NPNC_N.M", 
                  dpi1 = "DTFC%YPD.Q", 
                  stuloans = "DTCTLNS_N.Q", 
                  vehloans = "DTCTLNV_N.Q", 
                  nysevol = "SMTVN_N.B", 
                  smallgains = "EJNF500T999_N.Q",
                  emp_pop = "LFCE%NPNC.M",
                  unemp_rate = "ruc.q")

us_series <- lapply(us_series, function(ser) dfa_getfame(ser, "us"))

## FOF database

fof_series <- c(
  db_pensions = "FL594190045.Q",
  dc_pensions = "FL594090055.Q",
  iras = "LM893131573.Q",
  morts = "LA153165105.Q",
  cons_cred = "LA153166000.Q",
  disposable_income = "FA156012005.Q",
  dpi2 = "FL010000346.Q",
  dpi4 = "FA156012005.Q",
  treas = "FI073161103.Q", #a series that is not publicly available
  noncorp_re = "LM115035005.Q",
  noncorp_re_nonres = "FL115035035.Q",
  noncorp_re_res = "FL115035023.Q",
  noncorp_re_reval = "FR115035005.Q"
  
)

fof_series <- lapply(fof_series, function(ser) dfa_getfame(ser, b101h_db))

## USEAIDA database

useaida_series <- c(
  lfpr = "lfpr.q",
  ru = "ru.q"
)

useaida_series <- lapply(useaida_series, function(ser) dfa_getfame(ser, "useaida"))

join_by_date <- function(x,y) left_join(x,y,by = "date")

indicator_series <- unlist(list(us_series, fof_series, useaida_series), recursive = FALSE)
indicators <- Reduce(join_by_date, indicator_series)
names(indicators) <- c("date", names(indicator_series))



indicators <- indicators %>%
  fill(names(.), .direction = "updown") %>%
  mutate(realff = fedfunds / cpi,
         dbdc = db_pensions / (dc_pensions + iras),
         dpi3 = (morts + cons_cred) / disposable_income,
         # linearly interpolating 2020q2 and 2020q3 for homeown and dpi3 due to lockdown data issues
         homeown = case_when(date == as.yearqtr("2020q2") ~ 65.47, date == as.yearqtr("2020q3") ~ 65.63, TRUE ~ homeown),
         dpi3 = case_when(date == as.yearqtr("2020q2") ~ 0.881185, date == as.yearqtr("2020q3") ~ 0.878199, date == as.yearqtr("2021q1") ~ 0.85026,TRUE ~ dpi3)) %>%
  # only keep the series that are selected to be used 
  select(date, all_of(unique(unlist(indicator_choices)))) 

# We use the aggregate line as an indicator series as well, but first need to put it at a per-household level

totalhh <- hhcount %>%
  pivot_longer(-date, names_to = "cat", values_to = "hhcount") %>%
  dplyr::group_by(date) %>%
  dplyr::summarise(hhcount = sum(hhcount))

agg <- dfa_getfame(series, database = b101h_db) %>%
  filter(date <= as.yearqtr(lastqtr)) 

agg_perhh <- agg %>%
  left_join(totalhh, by = "date") %>%
  mutate_at(vars(all_of(names(series))), ~. * 1e6/hhcount) %>%
  select(date, all_of(names(series)), hhcount)

## Here we are just combining general economic indicators with household counts as well as aggregate lines 

indicators <- left_join(indicators, agg_perhh, by = "date") %>%
  # divide some of the other indicator series to be on a per household basis as well
  mutate_at(vars(noncorp_re_res, noncorp_re_nonres, stuloans), ~(. * 1e6)/hhcount)

indicators_list <- list()

for (gr in names(scf_data_list)) {
  for (line in names(series)) {
    indicators_list[[gr]][[line]] <- select(indicators, date, all_of(c(indicator_choices[[line]], line))) %>%
      # scale indicator series to 1
      mutate_if(is.numeric, scale2one) %>%
      # filter out dates that are not included for the group (currently only needed for Millennials in the generation split)
      filter(date >= min(scf_data_list[[gr]][["date"]])) 
    
  }
}

# Temporal disagregation  --------------------------------------------------------------
print("Applying temporal disaggregation")

# if you want to use the Kalman filter/smoother, process occurs in a separate script because
# this is not an option in the tempdisagg package's td function that we use here 

if (tempdisagg_method == "kalman") {
  source(file.path(master_scripts_directory, "kalman.R"))
} else {
  
  # Put SCF data and indicator series in time series format to comply with tempdisagg package
  
  # SEF 8/24/20: There is a glitch in the td function from tempdisagg where it will complain that the indicators
  # are too short unless they cover the entire final three-year time period. That is why you append zeros 
  # at the end and just forecast extra quarters to appease the function, then trim them later. The
  # 10 zeros in the code below may have to change (next quarter I think it will be 9). 
  
  
  indicators_list <- lapply(indicators_list, function(gr) lapply(gr, function(line) lapply(unlist(group_split(line), recursive = FALSE), function (ind)
    ts(c(ind, rep(0,10)), start = c(year(min(line[["date"]])), 1), frequency = 4))))
  
  
  scf_data_list <- lapply(scf_data_list, function(gr) lapply(unlist(group_split(gr), recursive = FALSE), function(line)
    zoo::na.trim(ts(line, start = year(as.Date(min(gr[["date"]]))), deltat = 3))))
  
  ## this is where you actually apply the disaggregation
  
  scf_data_disagg <- list()
  
  for (gr in names(scf_data_list)) {
    for (line in names(series)) {
      group_start <- min(indicators_list[[gr]][[line]][["date"]])
      assign("y", scf_data_list[[gr]][[line]])
      for (ind in 1:length(indicators_list[[gr]][[line]])) {
        assign(paste0("x_", ind), indicators_list[[gr]][[line]][[ind]])
      }
      scf_data_disagg[[gr]][[line]] <- td(formula(paste0("y ~ ", paste0("x_", seq(2, length(indicators_list[[gr]][[line]])),collapse = "+"))),
                                          conversion = "first", to = "quarterly", method = tempdisagg_method, truncated.rho = trunc.rho
      )
    }
  }
  
  scf_data_imputed <- lapply(scf_data_disagg, function(gr) lapply(gr, function(asset) predict(asset)))
  
  # Convert away from time series and add the date back in
  
  for (gr in names(scf_data_imputed)) {
    for (line in names(series)) {
      date_col <- data.frame(date = time(scf_data_imputed[[gr]][[line]]))
      scf_data_imputed[[gr]][[line]] <- data.frame(cbind(date_col, as.numeric(scf_data_imputed[[gr]][[line]]))) %>%
        setNames(c("date", line))
    }
    scf_data_imputed[[gr]] <- Reduce(join_by_date, scf_data_imputed[[gr]])
  }
  
  ## Calculate standard errors
  
  # have not gotten standard errors to work on generation split 
  if (group != "generation") {
    source(file.path(master_scripts_directory, "tempdisagg_standard_errors.R"))
  }
  
}

setwd(output_directory)
# Create DFA levels and shares --------------------------------------------
print("Creating DFA levels and shares.")
scf_levels <- bind_rows(scf_data_imputed, .id = "cat") 

if (tempdisagg_method != "kalman") {
  scf_levels <- scf_levels %>% 
    mutate(date = as.yearqtr(date + .5)) # for ease of computation we "pretend" we start at 1989q1 so here we have to shift the dates back to their proper places
}

scf_levels <- scf_levels %>%
  mutate_if(is.numeric, neg_to_zero) %>% # we do not allow individual balance sheet lines to go negative 
  complete(date, cat) %>% # if a certain group does not exist for a particular date, add it in
  mutate(cat = factor(cat, levels = groupnames)) %>%
  arrange(date, cat) %>%
  replace(is.na(.), 0) %>% # replace NA values with zeroes 
  filter(date <= as.yearqtr(lastqtr))  # this is where you trim off the extra quarters we have to extrapolate to avoid the glitch in the td function

## this is still in per household terms--bring extrapolated hhcounts back in to scale up to the group's aggregate 

hhcount <- hhcount %>%
  pivot_longer(-date, names_to = "cat", values_to = "hhcount") 

scf_levels <- left_join(scf_levels, hhcount, by = c("date", "cat")) %>%
  mutate_at(vars(all_of(names(series))), ~.* hhcount) %>%
  select(-hhcount)

# Determine implied shares for each group from imputed level and implied aggregates 

scf_shares <- scf_levels %>%
  group_by(date) %>%
  mutate_if(is.numeric, ~./sum(.))

# Apply shares to b101h aggregates

dfa_levels <- left_join(
  pivot_longer(scf_shares, -c(date,cat), names_to = "asset", values_to = "share"),
  pivot_longer(agg, -date, names_to = "asset", values_to = "b101h"),
  by = c("date", "asset")
) %>%
  mutate(b101h = b101h * 1e6,
         level = share * b101h) %>%
  select(date, cat, asset, level) %>%
  pivot_wider(id_cols = c(date, cat), names_from = asset, values_from = level) %>%
  ## Create composite b101h variables
  mutate(non_fin = real_estate+con_durables, 
         debt_sec=us_gov_muni+corp_for_bonds,
         ass_loans=oth_loans+morts_owed,
         liab_loans = morts+cons_cred+dep_loans+oth_loans_adv) %>%
  mutate(fin = check_curr+time_dep+mmfs+debt_sec+ass_loans+corp_equ_mfs+life_ins+pension+non_corp_eq+misc_assets) %>%
  mutate(assets = non_fin+fin,
         liab = liab_loans+unpaid_life_ins) %>%
  mutate(networth = assets - liab) %>%
  select(date, cat, assets, non_fin, real_estate, con_durables, fin, check_curr, time_dep, mmfs, debt_sec, us_gov_muni,
         corp_for_bonds, ass_loans, oth_loans, morts_owed, corp_equ_mfs, life_ins, pension, non_corp_eq, misc_assets,
         liab, liab_loans, morts, cons_cred, dep_loans, oth_loans_adv, unpaid_life_ins, networth)

# Calculate dfa shares based on dfa levels 
dfa_shares <- dfa_levels %>%
  group_by(date) %>%
  mutate_if(is.numeric, ~./sum(.))


if (tempdisagg_method %in% c("chow-lin-maxlog", "chow-lin-minrss-ecotrim", "chow-lin-minrss-quilis")) {
  trunc_lab <- ifelse(trunc.rho == 0, "trunc", "notrunc")
  method_lab <- paste0(str_remove_all(tempdisagg_method, "-"), "_", trunc_lab)
} else {
  method_lab <- str_remove_all(tempdisagg_method, "-")
}


# Output results ----------------------------------------------------------
print(paste("Saving results to", output_directory))


setwd(output_directory)
write_csv(dfa_levels, dfa_file_names(item = "levels", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))
write_csv(dfa_shares, dfa_file_names(item = "shares", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))
write_csv(scf_levels, dfa_file_names(item = "scf_levels", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))

# Output results to RSMA shiny folder -------------------------------------
print(paste("Saving results to", shiny_directory))


setwd(shiny_directory)
write_csv(dfa_levels, dfa_file_names(item = "levels", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))
write_csv(dfa_shares, dfa_file_names(item = "shares", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))
write_csv(scf_levels, dfa_file_names(item = "scf_levels", group = group, agg_level = agg_level, method = method_lab, filetag = filetag, scf_ignore = scf_ignore))

## Turn warnings back to whatever you had them as before running this script. 
options(warn = oldw)
